
Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.
While Home Credit is currently using various statistical and machine learning methods to make these predictions, they're challenging Kagglers to help them unlock the full potential of their data. Doing so will ensure that clients capable of repayment are not rejected and that loans are given with a principal, maturity, and repayment calendar that will empower their clients to be successful.
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
# from plotly import tools
# import plotly.tools as tls
# import squarify
# from mpl_toolkits.basemap import Basemap
# from numpy import array
# from matplotlib import cm
# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()
# from sklearn import preprocessing
# # Supress unnecessary warnings so that presentation looks clean
# import warnings
# warnings.filterwarnings("ignore")
# # Print all rows and columns
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
import os
print(os.listdir("../input"))
application_train = pd.read_csv('../input/application_train.csv')
POS_CASH_balance = pd.read_csv('../input/POS_CASH_balance.csv')
bureau_balance = pd.read_csv('../input/bureau_balance.csv')
previous_application = pd.read_csv('../input/previous_application.csv')
installments_payments = pd.read_csv('../input/installments_payments.csv')
credit_card_balance = pd.read_csv('../input/credit_card_balance.csv')
bureau = pd.read_csv('../input/bureau.csv')
application_test = pd.read_csv('../input/application_test.csv')
print('Size of application_train data', application_train.shape)
print('Size of POS_CASH_balance data', POS_CASH_balance.shape)
print('Size of bureau_balance data', bureau_balance.shape)
print('Size of previous_application data', previous_application.shape)
print('Size of installments_payments data', installments_payments.shape)
print('Size of credit_card_balance data', credit_card_balance.shape)
print('Size of bureau data', bureau.shape)
application_train data
application_train.head()
application_train.columns.values
POS_CASH_balance data
POS_CASH_balance.head()
bureau_balance data
bureau_balance.head()
previous_application data
previous_application.head()
previous_application.columns.values
installments_payments data
installments_payments.head()
credit_card_balance data
credit_card_balance.head()
credit_card_balance.columns.values
bureau data
bureau.head()
checking missing data in application_train
# checking missing data
total = application_train.isnull().sum().sort_values(ascending = False)
percent = (application_train.isnull().sum()/application_train.isnull().count()*100).sort_values(ascending = False)
missing_application_train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_application_train_data.head(20)
checking missing data in POS_CASH_balance
# checking missing data
total = POS_CASH_balance.isnull().sum().sort_values(ascending = False)
percent = (POS_CASH_balance.isnull().sum()/POS_CASH_balance.isnull().count()*100).sort_values(ascending = False)
missing_POS_CASH_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_POS_CASH_balance_data.head(3)
checking missing data in bureau_balance
# checking missing data
total = bureau_balance.isnull().sum().sort_values(ascending = False)
percent = (bureau_balance.isnull().sum()/bureau_balance.isnull().count()*100).sort_values(ascending = False)
missing_bureau_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_balance_data.head(3)
checking missing data in previous_application
# checking missing data
total = previous_application.isnull().sum().sort_values(ascending = False)
percent = (previous_application.isnull().sum()/previous_application.isnull().count()*100).sort_values(ascending = False)
missing_previous_application_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_previous_application_data.head(15)
checking missing data in installments_payments
# checking missing data
total = installments_payments.isnull().sum().sort_values(ascending = False)
percent = (installments_payments.isnull().sum()/installments_payments.isnull().count()*100).sort_values(ascending = False)
missing_installments_payments_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_installments_payments_data.head(3)
checking missing data in credit_card_balance
# checking missing data
total = credit_card_balance.isnull().sum().sort_values(ascending = False)
percent = (credit_card_balance.isnull().sum()/credit_card_balance.isnull().count()*100).sort_values(ascending = False)
missing_credit_card_balance_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_credit_card_balance_data.head(10)
checking missing data in bureau
# checking missing data
total = bureau.isnull().sum().sort_values(ascending = False)
percent = (bureau.isnull().sum()/bureau.isnull().count()*100).sort_values(ascending = False)
missing_bureau_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_bureau_data.head(8)
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(application_train["AMT_CREDIT"])
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_INCOME_TOTAL")
ax = sns.distplot(application_train["AMT_INCOME_TOTAL"].dropna())
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_GOODS_PRICE")
ax = sns.distplot(application_train["AMT_GOODS_PRICE"].dropna())
temp = application_train["NAME_TYPE_SUITE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='schoolStateNames')
temp = application_train["TARGET"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Loan Repayed or not')
temp = application_train["NAME_CONTRACT_TYPE"].value_counts()
fig = {
"data": [
{
"values": temp.values,
"labels": temp.index,
"domain": {"x": [0, .48]},
#"name": "Types of Loans",
#"hoverinfo":"label+percent+name",
"hole": .7,
"type": "pie"
},
],
"layout": {
"title":"Types of loan",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loan Types",
"x": 0.17,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp1 = application_train["FLAG_OWN_CAR"].value_counts()
temp2 = application_train["FLAG_OWN_REALTY"].value_counts()
fig = {
"data": [
{
"values": temp1.values,
"labels": temp1.index,
"domain": {"x": [0, .48]},
"name": "Own Car",
"hoverinfo":"label+percent+name",
"hole": .6,
"type": "pie"
},
{
"values": temp2.values,
"labels": temp2.index,
"text":"Own Realty",
"textposition":"inside",
"domain": {"x": [.52, 1]},
"name": "Own Reality",
"hoverinfo":"label+percent+name",
"hole": .6,
"type": "pie"
}],
"layout": {
"title":"Purpose of loan",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Own Car",
"x": 0.20,
"y": 0.5
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Own Realty",
"x": 0.8,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp = application_train["NAME_INCOME_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Income sources of Applicant\'s', hole = 0.5)
temp = application_train["NAME_FAMILY_STATUS"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Family Status of Applicant\'s', hole = 0.5)
temp = application_train["OCCUPATION_TYPE"].value_counts()
temp.iplot(kind='bar', xTitle = 'Occupation', yTitle = "Count", title = 'Occupation of Applicant\'s who applied for loan', color = 'green')
temp = application_train["NAME_EDUCATION_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Education of Applicant\'s', hole = 0.5)
temp = application_train["NAME_HOUSING_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Type of House', hole = 0.5)
temp = application_train["ORGANIZATION_TYPE"].value_counts()
temp.iplot(kind='bar', xTitle = 'Organization Name', yTitle = "Count", title = 'Types of Organizations who applied for loan ', color = 'red')
temp = application_train["NAME_INCOME_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_INCOME_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_INCOME_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Income sources of Applicant's in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Income source',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_FAMILY_STATUS"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_FAMILY_STATUS"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_FAMILY_STATUS"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Family Status of Applicant's in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Family Status',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["OCCUPATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["OCCUPATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["OCCUPATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Occupation of Applicant's in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Occupation of Applicant\'s',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_EDUCATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_EDUCATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_EDUCATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Education of Applicant's in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Education of Applicant\'s',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_HOUSING_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_HOUSING_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_HOUSING_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "For which types of house higher applicant's applied for loan in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='types of house',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["ORGANIZATION_TYPE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["ORGANIZATION_TYPE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["ORGANIZATION_TYPE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Types of Organizations in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Types of Organizations',
tickfont=dict(
size=10,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = application_train["NAME_TYPE_SUITE"].value_counts()
#print(temp.values)
temp_y0 = []
temp_y1 = []
for val in temp.index:
temp_y1.append(np.sum(application_train["TARGET"][application_train["NAME_TYPE_SUITE"]==val] == 1))
temp_y0.append(np.sum(application_train["TARGET"][application_train["NAME_TYPE_SUITE"]==val] == 0))
trace1 = go.Bar(
x = temp.index,
y = (temp_y1 / temp.sum()) * 100,
name='YES'
)
trace2 = go.Bar(
x = temp.index,
y = (temp_y0 / temp.sum()) * 100,
name='NO'
)
data = [trace1, trace2]
layout = go.Layout(
title = "Distribution of Name of type of the Suite in terms of loan is repayed or not in %",
#barmode='stack',
width = 1000,
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
temp = previous_application["NAME_CONTRACT_TYPE"].value_counts()
fig = {
"data": [
{
"values": temp.values,
"labels": temp.index,
"domain": {"x": [0, .48]},
#"name": "Types of Loans",
#"hoverinfo":"label+percent+name",
"hole": .7,
"type": "pie"
},
],
"layout": {
"title":"Contract product type of previous application",
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Contract product type",
"x": 0.12,
"y": 0.5
}
]
}
}
iplot(fig, filename='donut')
temp = previous_application["WEEKDAY_APPR_PROCESS_START"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
title = "On which day highest number of clients applied in prevoies application in % ",
xaxis=dict(
title='Day of the Week',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Day of Week in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='schoolStateNames')
temp = previous_application["NAME_CASH_LOAN_PURPOSE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Blues',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "Purpose of cash loan in previous application in % ",
xaxis=dict(
title='Purpose of cash loan',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_CONTRACT_STATUS"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Contract approved or not', hole = 0.5)
temp = previous_application["NAME_PAYMENT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Payment method that client choose to pay for the previous application', hole = 0.7,colors=['#75e575', '#ea7c96',])
temp = previous_application["CODE_REJECT_REASON"].value_counts()
temp.iplot(kind='bar', xTitle = 'Reason', yTitle = "Count", title = 'Top reasons of previous application rejection(Wriiten in some codes)', colors=['#75e575'])
temp = previous_application["NAME_TYPE_SUITE"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = '#ea7c96',
#reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "Who accompanied client when applying for the previous application in % ",
xaxis=dict(
title='Name of type of the Suite',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count of Name of type of the Suite in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_CLIENT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Was the client old or new client when applying for the previous application', hole = 0.7,colors=['#ea7c96','#75e575',])
temp = previous_application["NAME_GOODS_CATEGORY"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "What kind of goods did the client apply for in the previous application in % ",
xaxis=dict(
title='Name of the goods',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_PORTFOLIO"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Was the previous application for CASH, POS, CAR, …', hole = 0.7,colors=['#ea7c96','#75e575',])
temp = previous_application["NAME_PRODUCT_TYPE"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Was the previous application x-sell or walk-in', hole = 0.7)
temp = previous_application["CHANNEL_TYPE"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
# colorscale = 'Violet',
# reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "Top channels through which they acquired the client on the previous application in % ",
xaxis=dict(
title='Name of the Channel',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_SELLER_INDUSTRY"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "Top industry of the seller in % ",
xaxis=dict(
title='Industry Name',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["NAME_YIELD_GROUP"].value_counts()
trace = go.Bar(
x = temp.index,
y = (temp / temp.sum())*100,
marker=dict(
color=(temp / temp.sum())*100,
colorscale = 'Greens',
reversescale = True
),
)
data = [trace]
layout = go.Layout(
title = "Grouped interest rate into small medium and high of the previous application in % ",
xaxis=dict(
title='Grouped Interest rate',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Count in %',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
temp = previous_application["PRODUCT_COMBINATION"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Top Detailed product combination of the previous application', hole = 0.7)
temp = previous_application["NFLAG_INSURED_ON_APPROVAL"].value_counts()
df = pd.DataFrame({'labels': temp.index,
'values': temp.values
})
df.iplot(kind='pie',labels='labels',values='values', title='Did the client requested insurance during the previous application(YES : 1, NO : 0)', hole = 0.7, colors=['#ea7c96','#75e575',])
data = [
go.Heatmap(
z= application_train.corr().values,
x=application_train.columns.values,
y=application_train.columns.values,
colorscale='Viridis',
reversescale = False,
text = True ,
opacity = 1.0 )
]
layout = go.Layout(
title='Pearson Correlation of features',
xaxis = dict(ticks='', nticks=36),
yaxis = dict(ticks='' ),
width = 900, height = 700,
margin=dict(
l=240,
),)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')
from sklearn import preprocessing
categorical_feats = [
f for f in application_train.columns if application_train[f].dtype == 'object'
]
for col in categorical_feats:
lb = preprocessing.LabelEncoder()
lb.fit(list(application_train[col].values.astype('str')) + list(application_test[col].values.astype('str')))
application_train[col] = lb.transform(list(application_train[col].values.astype('str')))
application_test[col] = lb.transform(list(application_test[col].values.astype('str')))
application_train.fillna(-999, inplace = True)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, max_depth=8, min_samples_leaf=4, max_features=0.5, random_state=2018)
rf.fit(application_train.drop(['SK_ID_CURR', 'TARGET'],axis=1), application_train.TARGET)
features = application_train.drop(['SK_ID_CURR', 'TARGET'],axis=1).columns.values
x, y = (list(x) for x in zip(*sorted(zip(rf.feature_importances_, features),
reverse = False)))
trace2 = go.Bar(
x=x ,
y=y,
marker=dict(
color=x,
colorscale = 'Viridis',
reversescale = True
),
name='Random Forest Feature importance',
orientation='h',
)
layout = dict(
title='Barplot of Feature importances',
width = 900, height = 2000,
yaxis=dict(
showgrid=False,
showline=False,
showticklabels=True,
# domain=[0, 0.85],
),
margin=dict(
l=300,
),
)
fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
py.iplot(fig1, filename='plots')